Set Up and Prep

library(compositions)
library(zCompositions)
library(ape)
library(plyr)
library(dplyr)
library(ggplot2)
library(gplots)
library(lme4)
library(tidyr)
library(vegan)
library(scales)
library(grid)
library(reshape2)
library(phyloseq)
library(ALDEx2)
library(gridExtra)
library(stringr)

# Categorical palette (Okabe–Ito-style colors plus extras) used for most plots.
m.g.colors <- c(
  "#999999", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00",
  "#CC79A7", "#CBD588", "#5F7FC7", "orange", "#DA5724", "#508578",
  "#CD9BCD", "#AD6F3B", "#673770", "#D14285", "#652926", "#C84248",
  "#8569D5", "#5E738F", "#D1A33D", "#8A7C64", "#599861"
)

# Larger fallback palette for charts with many categories.
more.colors <- c(
  "indianred1", "steelblue3", "skyblue1", "mediumorchid", "royalblue4",
  "olivedrab3", "pink", "#FFED6F", "mediumorchid3", "ivory2", "tan1",
  "aquamarine3", "#C0C0C0", "mediumvioletred", "#999933", "#666699",
  "#CC9933", "#006666", "#3399FF", "#993300", "#CCCC99", "#666666",
  "#FFCC66", "#6699CC", "#663366", "#9999CC", "#CCCCCC", "#669999",
  "#CCCC66", "#CC6600", "#9999FF", "#0066CC", "#99CCCC", "#999999",
  "#FFCC00", "#009999", "#FF9900", "#999966", "#66CCCC", "#339966",
  "#CCCC33", "#EDEDED"
)

# Global ggplot theme for the whole document.
theme_set(theme_classic(base_size = 12, base_family = "Avenir"))
# mothur output files and sample metadata.
# NOTE: changed `=` to `<-` for top-level assignment, per R style convention.
taxfile <- "ACS.cons.taxonomy"
sharedfile <- "ACS.shared"
mdf <- read.csv("fakenames.mdf.csv", row.names = 1, header = TRUE)
mdf

Standard import of mothur data

# Import the mothur taxonomy + shared files as a single phyloseq object.
mothur_data <- import_mothur(
  mothur_constaxonomy_file = taxfile,
  mothur_shared_file = sharedfile
)

# Attach the sample metadata to the taxonomy and OTU tables.
moth_merge <- merge_phyloseq(
  sample_data(mdf),
  tax_table(mothur_data),
  otu_table(mothur_data)
)

# Replace mothur's generic rank names with readable taxonomy levels.
rank.labels <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")
colnames(tax_table(moth_merge)) <- rank.labels

# Keep prokaryotes only and drop organellar sequences
# (mitochondria at the Family rank, chloroplasts at the Class rank).
waterdat <- subset_taxa(
  moth_merge,
  Kingdom %in% c("Bacteria", "Archaea") &
    Family != "mitochondria" &
    Class != "Chloroplast"
)

waterdat
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 47725 taxa and 358 samples ]
## sample_data() Sample Data:       [ 358 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 47725 taxa by 6 taxonomic ranks ]

Clean data by removing singletons and pruning samples with a depth < 10000.

otu.table <- otu_table(waterdat)
# Zero out singleton reads (counts of exactly 1).
otu.table[otu.table == 1] <- 0

# Rebuild the phyloseq object, then drop shallow samples (<= 10000 reads)
# and any OTUs left with zero total counts.
acsdat.clean <- merge_phyloseq(
  otu_table(otu.table, taxa_are_rows = TRUE),
  tax_table(waterdat),
  sample_data(waterdat)
) %>%
  prune_samples(sample_sums(.) > 10000, .) %>%
  prune_taxa(taxa_sums(.) > 0, .)

acsdat.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 18583 taxa and 344 samples ]
## sample_data() Sample Data:       [ 344 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 18583 taxa by 6 taxonomic ranks ]
# Pull plain data frames out of the cleaned phyloseq object.
otu.table <- data.frame(otu_table(acsdat.clean))
tax.table <- data.frame(tax_table(acsdat.clean))
mdf <- data.frame(sample_data(acsdat.clean))

# Centered log-ratio (CLR) transform of an OTU count table (OTUs = rows,
# samples = columns). Rows with total counts <= 10 are dropped, zeros are
# replaced via zCompositions::cmultRepl (CZM), then each sample (column) is
# log-transformed and centered on its own mean log value.
# Returns a matrix of CLR values with samples as columns.
transform.clr <- function(otu.df){
  # Drop very low-abundance OTUs before zero replacement.
  # NOTE: fixed `check.names=F` to the full `FALSE` (T/F can be reassigned).
  d.1 <- data.frame(otu.df[which(rowSums(otu.df) > 10),],
    check.names = FALSE)
  # cmultRepl expects samples as rows, hence the transpose round-trip.
  d.czm <- t(cmultRepl(t(d.1), label = 0, method = "CZM"))
  # CLR: log(x) minus the mean log within each sample.
  d.clr <- apply(d.czm, 2, function(x){log(x) - mean(log(x))})
  return(d.clr)
}

My favorite ordinations use a CLR transform of the data. We’ll also do some that rely on samples having an even depth later in the document.

otu.clr.table <- transform.clr(otu.table)
## No. corrected values:  10092

Principal Component Analysis (PCA)

The prcomp implementation of PCA expects your samples to be rows and your columns to be OTUs. It returns several things of interest… the variable rotation is the matrix of OTU loadings. The variable x contains your sample compositions multiplied by the rotational matrix.

The data is already scaled as much as we want it to be, but you should center around zero. This feels counter-intuitive because we just applied a centered log ratio transform, but centering the data now will make it easier to interpret the loading variables of the PCA at the end.

# PCA on the CLR matrix; prcomp wants samples as rows, so transpose.
# Data is already CLR-scaled, so only center (no scaling).
d.pcx <- prcomp(t(otu.clr.table), scale = FALSE, center = TRUE)

# Total multivariate variance = sum of squared singular-value sdevs.
d.mvar <- sum(d.pcx$sdev^2)
# Axis labels showing percent of variance explained by each PC.
# (sdev[i]^2 is already a scalar; the original sum() wrapper was redundant.)
PC1.label <- paste("PC1: ", percent(d.pcx$sdev[1]^2 / d.mvar, accuracy = 0.1))
PC2.label <- paste("PC2: ", percent(d.pcx$sdev[2]^2 / d.mvar, accuracy = 0.1))
PC3.label <- paste("PC3: ", percent(d.pcx$sdev[3]^2 / d.mvar, accuracy = 0.1))
print(c(PC1.label, PC2.label, PC3.label))
## [1] "PC1:  14.8%" "PC2:  11.8%" "PC3:  5.8%"

The total variance accounted for in the first three axes is 32.4%. That’s pretty good.

pcx.importance <- summary(d.pcx)$importance
# Row "Proportion of Variance" (row 2) of the importance matrix holds the
# fraction of variance explained per component.
percent.pcx <- pcx.importance["Proportion of Variance", ]

# Scree plot of the first 20 components.
barplot(head(percent.pcx, 20))

This scree plot shows us the drop off point for variables contributing to the PCA. Here it looks like there is only meaningful gain out to PC3. Most information is in PC1 and PC2.

Visualizations

To visualize, extract the rotated data (x) and make it a dataframe. We can then pull variables to use as symbology from your metadata file.

# Rotated sample coordinates, one row per sample.
pca.points <- data.frame(d.pcx$x)

# Attach metadata in one indexed assignment instead of five copy-pasted
# lines: names(meta.cols) are the new pca.points columns, values are the
# corresponding mdf columns (note site.type comes from mdf$type).
meta.cols <- c(site = "site", stream = "stream", community = "community",
               site.type = "type", date = "date")
pca.points[names(meta.cols)] <- mdf[row.names(pca.points), meta.cols]

# Month = first field of the "M/D/Y" date string.
# vapply (not sapply) pins the return type to one character per element.
pca.points$month <- as.factor(vapply(pca.points$date,
  function(x) str_split(x, pattern = "/")[[1]][1], character(1)))

ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).

I like to add quadrant lines to make interpretation a little easier.

ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_hline(yintercept=0, linetype=2, color="gray", alpha=0.7)+ geom_vline(xintercept=0, linetype=2, color="gray", alpha=0.7)
## Warning: Removed 1 rows containing missing values (geom_point).

In this dataset, it’s not necessary to display variation on the third PC axis, but if you wanted to do it, you would just switch out the PC1 and PC3 variables like so.

ggplot(pca.points, aes(x=PC2, y=PC3, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC3.label) + xlab(PC2.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_hline(yintercept=0, linetype=2, color="gray", alpha=0.7)+ geom_vline(xintercept=0, linetype=2, color="gray", alpha=0.7)
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot doesn’t have direct support for plotting in 3D, but plotly and gg3d do. Here’s how to install and use those.

gg3D must be downloaded directly from github. Devtools will display a message in your console asking you to update packages. I don’t usually update packages through this tool unless installing the package I want doesn’t work the first time.

# gg3D is GitHub-only; devtools may prompt to update other packages —
# declining is usually fine.
devtools::install_github("AckerDWM/gg3d")
library(gg3D)

ggplot(pca.points, aes(x=PC1, y=PC2, z=PC3, col=site.type, shape=community)) + theme_void() + axes_3D() + stat_3D() + scale_color_manual(values=m.g.colors) + ylab(PC3.label) + xlab(PC2.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).

This is somewhat nice because it’s easy to see that the clusters are well separated, but unfortunately there are no axis labels!

Here’s how to do it in plotly. You can install plotly the regular way using install.packages().

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(x=pca.points$PC1, y=pca.points$PC2, z=pca.points$PC3, type="scatter3d", mode="markers", color=pca.points$site.type, symbol=pca.points$community)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: The following are not valid symbol codes:
## 'NA'
## Valid symbols include:
## '0', 'circle', '100', 'circle-open', '200', 'circle-dot', '300', 'circle-open-dot', '1', 'square', '101', 'square-open', '201', 'square-dot', '301', 'square-open-dot', '2', 'diamond', '102', 'diamond-open', '202', 'diamond-dot', '302', 'diamond-open-dot', '3', 'cross', '103', 'cross-open', '203', 'cross-dot', '303', 'cross-open-dot', '4', 'x', '104', 'x-open', '204', 'x-dot', '304', 'x-open-dot', '5', 'triangle-up', '105', 'triangle-up-open', '205', 'triangle-up-dot', '305', 'triangle-up-open-dot', '6', 'triangle-down', '106', 'triangle-down-open', '206', 'triangle-down-dot', '306', 'triangle-down-open-dot', '7', 'triangle-left', '107', 'triangle-left-open', '207', 'triangle-left-dot', '307', 'triangle-left-open-dot', '8', 'triangle-right', '108', 'triangle-right-open', '208', 'triangle-right-dot', '308', 'triangle-right-open-dot', '9', 'triangle-ne', '109', 'triangle-ne-open', '209', 'triangle-ne-dot', '309', 'triangle-ne-open-dot', '10', 'triangle-se', '110', 'triangle-se-open', '210', 'triangle-se-dot', '310', 'triangle-se-open-dot', '11', 'triangle-sw', '111', 'triangle-sw-open', '211', 'triangle-sw-dot', '311', 'triangle-sw-open-dot', '12', 'triangle-nw', '112', 'triangle-nw-open', '212', 'triangle-nw-dot', '312', 'triangle-nw-open-dot', '13', 'pentagon', '113', 'pentagon-open', '213', 'pentagon-dot', '313', 'pentagon-open-dot', '14', 'hexagon', '114', 'hexagon-open', '214', 'hexagon-dot', '314', 'hexagon-open-dot', '15', 'hexagon2', '115', 'hexagon2-open', '215', 'hexagon2-dot', '315', 'hexagon2-open-dot', '16', 'octagon', '116', 'octagon-open', '216', 'octagon-dot', '316', 'octagon-open-dot', '17', 'star', '117', 'star-open', '217', 'star-dot', '317', 'star-open-dot', '18', 'hexagram', '118', 'hexagram-open', '218', 'hexagram-dot', '318', 'hexagram-open-dot', '19', 'star-triangle-up', '119', 'star-triangle-up-open', '219', 'star-triangle-up-dot', '319', 'star-triangle-up-open-dot', '20', 'star-triangle-down', '120', 'star-triangle-down-open', '220', 
'star-triangle-down-dot', '320', 'star-triangle-down-open-dot', '21', 'star-square', '121', 'star-square-open', '221', 'star-square-dot', '321', 'star-square-open-dot', '22', 'star-diamond', '122', 'star-diamond-open', '222', 'star-diamond-dot', '322', 'star-diamond-open-dot', '23', 'diamond-tall', '123', 'diamond-tall-open', '223', 'diamond-tall-dot', '323', 'diamond-tall-open-dot', '24', 'diamond-wide', '124', 'diamond-wide-open', '224', 'diamond-wide-dot', '324', 'diamond-wide-open-dot', '25', 'hourglass', '125', 'hourglass-open', '26', 'bowtie', '126', 'bowtie-open', '27', 'circle-cross', '127', 'circle-cross-open', '28', 'circle-x', '128', 'circle-x-open', '29', 'square-cross', '129', 'square-cross-open', '30', 'square-x', '130', 'square-x-open', '31', 'diamond-cross', '131', 'diamond-cross-open', '32', 'diamond-x', '132', 'diamond-x-open', '33', 'cross-thin', '133', 'cross-thin-open', '34', 'x-thin', '134', 'x-thin-open', '35', 'asterisk', '135', 'asterisk-open', '36', 'hash', '136', 'hash-open', '236', 'hash-dot', '336', 'hash-open-dot', '37', 'y-up', '137', 'y-up-open', '38', 'y-down', '138', 'y-down-open', '39', 'y-left', '139', 'y-left-open', '40', 'y-right', '140', 'y-right-open', '41', 'line-ew', '141', 'line-ew-open', '42', 'line-ns', '142', 'line-ns-open', '43', 'line-ne', '143', 'line-ne-open', '44', 'line-nw', '144', 'line-nw-open

Plotly is really a world unto itself and the data formatting is in a very different style from ggplot.

The interactiveness of the widget is great, but as you might imagine, not ideal for making figures for publications. Dive into formatting in plotly if you want to, but you will probably not need to.

Biplots

Biplots display your loadings and rotated data at the same time. They can be a good way to visually explore the contribution of different OTUs to variation between your samples.

PCAs made from many many columns (many many OTUs) don’t usually make readable biplots without a lot of filtering. I’ll show you how to do that, but the best use of biplots is definitely for PCAs made at lower taxonomic resolution (Order level or above for bacteria) or with a subset of columns.

Here’s a biplot with everything:

biplot(x=d.pcx$x, y=d.pcx$rotation, var.axes=TRUE, col=c("black", "#56B4E9"))

How gross! We can improve this.

#change sample labels to site labels
# Replace raw sample names with their site labels.
sample.labels <- mdf[row.names(d.pcx$x), "site"]

# cex < 1 shrinks label text (text size is multiplied by this factor).
biplot(
  x = d.pcx$x, y = d.pcx$rotation, var.axes = TRUE,
  col = c("black", "#56B4E9"), xlabs = sample.labels, cex = 0.5
)

Make a subset of the rotational matrix to remove OTUs that don’t contribute strongly to either axis. This threshold is arbitrary. Choose based on the distribution of values.

# Distribution of OTU loadings on PC1 — used to pick a filtering threshold.
summary(d.pcx$rotation[,"PC1"])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.1068653 -0.0039681 -0.0018742  0.0000000  0.0008445  0.0673561
# Same for PC2.
summary(d.pcx$rotation[,"PC2"])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.108679 -0.002995 -0.001034  0.000000  0.001104  0.138112
# Tail quantiles of each axis's loadings; these become candidate cutoffs
# for "strongly contributing" OTUs.
pc1q <- quantile(d.pcx$rotation[,"PC1"], probs=c(0.01, 0.03, 0.05, 0.1, 0.9, 0.95, 0.97, 0.99))
pc2q <- quantile(d.pcx$rotation[,"PC2"], probs=c(0.01, 0.03, 0.05, 0.1, 0.9, 0.95, 0.97, 0.99))
pc1q
##           1%           3%           5%          10%          90%          95% 
## -0.028171852 -0.015966457 -0.011441280 -0.007374515  0.010818191  0.022211952 
##          97%          99% 
##  0.032403087  0.047582507
pc2q
##           1%           3%           5%          10%          90%          95% 
## -0.030723456 -0.019042811 -0.013598755 -0.007774894  0.007995538  0.018018425 
##          97%          99% 
##  0.025846880  0.046757499

Let’s do the top and bottom 1% for both axes.

rotation.sub <- data.frame(d.pcx$rotation)[, c("PC1", "PC2")]
dim(rotation.sub)
## [1] 7103    2
# Keep OTUs falling in the extreme 1% tails of either axis's loadings.
extreme <- rotation.sub$PC1 <= pc1q["1%"] | rotation.sub$PC1 >= pc1q["99%"] |
  rotation.sub$PC2 <= pc2q["1%"] | rotation.sub$PC2 >= pc2q["99%"]
rotation.sub <- rotation.sub[which(extreme), ]

dim(rotation.sub)
## [1] 285   2

285 taxa is really still a lot! It will slightly improve the chart.

biplot(x=d.pcx$x, y=rotation.sub, var.axes=TRUE, col=c("black", "#56B4E9"), xlabs=sample.labels, cex=0.5)

Not so helpful.

Let’s try one at a higher taxonomic level. Let’s look at phylum contributions.

Agglomerate by taxa before you use a CLR transform.

# Agglomerate counts to the Phylum rank BEFORE the CLR transform.
acs.phylum <- acsdat.clean %>% tax_glom(taxrank="Phylum")

# CLR-transform the (much smaller) phylum-level table.
phylum.otu.table <- data.frame(otu_table(acs.phylum))
phylum.clr.table <- transform.clr(phylum.otu.table)
## No. corrected values:  39
dim(phylum.clr.table)
## [1]  30 344

As you can see it is a much smaller table.

# PCA on the phylum-level CLR matrix (samples as rows after the transpose).
d.pcx <- prcomp(t(phylum.clr.table), scale = FALSE, center = TRUE)

# Total variance, and per-axis percent-of-variance labels.
d.mvar <- sum(d.pcx$sdev^2)
# (sdev[i]^2 is already a scalar; the original sum() wrapper was redundant.)
PC1.label <- paste("PC1: ", percent(d.pcx$sdev[1]^2 / d.mvar, accuracy = 0.1))
PC2.label <- paste("PC2: ", percent(d.pcx$sdev[2]^2 / d.mvar, accuracy = 0.1))
PC3.label <- paste("PC3: ", percent(d.pcx$sdev[3]^2 / d.mvar, accuracy = 0.1))
print(c(PC1.label, PC2.label, PC3.label))
## [1] "PC1:  25.2%" "PC2:  15.2%" "PC3:  12.3%"

The total variance accounted for in the first three axes is 52.7%. The higher value isn’t so surprising, since we went from >7000 columns to only 30!

pcx.importance <- summary(d.pcx)$importance
# Row "Proportion of Variance" (row 2) holds the per-PC variance fractions.
percent.pcx <- pcx.importance["Proportion of Variance", ]

# Scree plot of the first 20 components.
barplot(head(percent.pcx, 20))

This scree plot shows that there is measurable gain in variance out to PC4.

The same basic visualization again, just to compare to the OTU based plot.

# Rotated sample coordinates for the phylum-level PCA.
pca.points <- data.frame(d.pcx$x)

# Attach metadata in one indexed assignment rather than five repeated lines
# (site.type maps from mdf$type).
meta.cols <- c(site = "site", stream = "stream", community = "community",
               site.type = "type", date = "date")
pca.points[names(meta.cols)] <- mdf[row.names(pca.points), meta.cols]

# Month = first field of the "M/D/Y" date string; vapply pins the type.
pca.points$month <- as.factor(vapply(pca.points$date,
  function(x) str_split(x, pattern = "/")[[1]][1], character(1)))

ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).

Interesting, yeah? Even though this PCA accounted for much more variance than the one based on OTU composition, hospital and residential types are inseparable in this chart. So we can conclude that they have very different OTU composition, but at the phylum level they are similar. In contrast, the industry and municipal Phandolin sites are different at the phylum level.

Here’s a biplot:

# Swap sample names for site labels.
sample.labels <- mdf[row.names(d.pcx$x), "site"]

# cex < 1 shrinks the label text.
biplot(
  x = d.pcx$x, y = d.pcx$rotation, var.axes = TRUE,
  col = c("black", "#56B4E9"), xlabs = sample.labels, cex = 0.5
)

It’s so far much more readable. We can make it a little easier still by replacing the OTU labels with the names of the phyla.

# Label loadings with phylum names instead of OTU IDs.
tax.labels <- tax.table[row.names(d.pcx$rotation), "Phylum"]

# Zoom in a little (ylim/xlim) and switch colors.
biplot(
  x = d.pcx$x, y = d.pcx$rotation, var.axes = TRUE,
  col = c("#333333", "#009E73"), xlabs = sample.labels,
  cex = c(0.5, 0.6), ylabs = tax.labels,
  ylim = c(-10, 7), xlim = c(-10, 12)
)

We can remove some loadings we don’t care about, like extremely rare phyla. My selection is somewhat arbitrary. You could base this selection off of major phyla that you identified in a different section of your analysis, or perhaps if you are studying pathogens, you could limit it to pathogens.

phyla.tax <- tax.table[row.names(d.pcx$rotation),]
# Phyla of interest for the biplot.
# NOTE(review): the original list had "Planctomycetes." with a trailing
# period, which can never match a real phylum name, silently excluding
# Planctomycetes; fixed here (the OTU list below may gain one entry).
major.phyla <- c("Proteobacteria", "Firmicutes", "Bacteroidetes",
                 "Fusobacteria", "Actinobacteria", "Verrucomicrobia",
                 "Deinococcus-Thermus", "Spirochaetes", "Synergistetes",
                 "Chloroflexi", "Acidobacteria", "Chlamydiae",
                 "Planctomycetes")
major.phyla.otus <- row.names(phyla.tax[which(phyla.tax$Phylum %in% major.phyla),])

major.phyla.otus
##  [1] "Otu00001" "Otu00004" "Otu00005" "Otu00007" "Otu00021" "Otu00030"
##  [7] "Otu00102" "Otu00214" "Otu00339" "Otu00410" "Otu00966" "Otu01206"
# Expand the graph a little (expand=1.3) and keep only the chosen loadings.
biplot(x = d.pcx$x, y = d.pcx$rotation[major.phyla.otus, ], var.axes = TRUE,
       col = c("#333333", "#009E73"), xlabs = sample.labels,
       cex = c(0.5, 0.6), ylabs = tax.table[major.phyla.otus, "Phylum"],
       ylim = c(-10, 7), xlim = c(-10, 12), expand = 1.3)

This will be easier to see if you save it as a higher resolution graphic.

Larger Version

Examining the Loadings

Of course you probably care about the OTU differences in the first PCA since there were some very clear clusters of residential vs. hospital type points. While it’s difficult to explore those in a visual way, we can look directly at the loadings for the PCA.

Here’s that object again.

# Recompute the OTU-level PCA (overwrites the phylum-level d.pcx).
d.pcx <- prcomp(t(otu.clr.table), scale = FALSE, center = TRUE)

d.mvar <- sum(d.pcx$sdev^2)
# Percent-of-variance axis labels (sdev[i]^2 is scalar; sum() was redundant).
PC1.label <- paste("PC1: ", percent(d.pcx$sdev[1]^2 / d.mvar, accuracy = 0.1))
PC2.label <- paste("PC2: ", percent(d.pcx$sdev[2]^2 / d.mvar, accuracy = 0.1))
PC3.label <- paste("PC3: ", percent(d.pcx$sdev[3]^2 / d.mvar, accuracy = 0.1))

pca.points <- data.frame(d.pcx$x)
# Attach metadata in one indexed assignment (site.type maps from mdf$type).
meta.cols <- c(site = "site", stream = "stream", community = "community",
               site.type = "type", date = "date")
pca.points[names(meta.cols)] <- mdf[row.names(pca.points), meta.cols]
# Month = first field of the "M/D/Y" date string.
pca.points$month <- as.factor(vapply(pca.points$date,
  function(x) str_split(x, pattern = "/")[[1]][1], character(1)))

ggplot(pca.points, aes(x = PC1, y = PC2, col = site.type, shape = community)) +
  geom_point(size = 3, alpha = 0.7, stroke = 1) +
  scale_color_manual(values = m.g.colors) +
  xlab(PC1.label) +
  ylab(PC2.label) +
  scale_shape_manual(values = c(1, 2, 7, 8))
## Warning: Removed 1 rows containing missing values (geom_point).

Looking at the graph, if we want to separate hospital from residential, we should look at which OTUs cause a sign change across the Y-axis (PC2).

# Loadings for the first two axes only.
d.loadings <- data.frame(d.pcx$rotation)[, c("PC1", "PC2")]

# Named vector of PC2 loadings, keyed by OTU.
pc2.otus <- setNames(d.loadings$PC2, row.names(d.loadings))

# Ten most negative and ten most positive PC2 loadings.
top10.negative <- head(sort(pc2.otus), 10)
top10.positive <- head(sort(pc2.otus, decreasing = TRUE), 10)

# Taxonomy of each extreme OTU, with its PC2 weight attached.
hospital.associated.taxa <- tax.table[names(top10.negative), ]
residential.associated.taxa <- tax.table[names(top10.positive), ]

hospital.associated.taxa$PC2.Weight <- top10.negative
residential.associated.taxa$PC2.Weight <- top10.positive
hospital.associated.taxa
residential.associated.taxa

We could now visualize these as a biplot if we wanted.

# All twenty extreme-PC2 OTUs, labeled by genus.
PC2.selection <- c(names(top10.positive), names(top10.negative))

tax.labels <- tax.table[PC2.selection, "Genus"]

biplot(
  x = d.pcx$x, y = d.pcx$rotation[PC2.selection, ], var.axes = TRUE,
  col = c("#333333", "#009E73"), xlabs = sample.labels,
  cex = c(0.5, 0.6), ylabs = tax.labels, expand = 1.5,
  ylim = c(-50, 70), xlim = c(-40, 50)
)

Again, this will be easier to look at if you save it as a file.

Top 20 OTUs

You can also display the relative contribution of each OTU in something like a barchart, but this is pretty misleading. The reader will probably be tempted to interpret the proportions as having anything to do with abundance, which they don’t. The taxa identified using ordination are usually not the most abundant taxa in a sample type, but are differentially abundant with other sample types.

Variants of PCA

For sparse datasets like MiSeq data, PCAs often account for a really disappointing amount of variance. The standard PCA also doesn’t fare well for binary data, where abundance is not interpretable (say for a prey composition dataset.). To address the first case, I suggest the package mixOmics. Installing mixOmics requires Bioconductor, a bioinformatics software manager for R. If you don’t have Bioconductor, install it by running this command:

# Install Bioconductor's package manager if missing, then pin the
# Bioconductor release this analysis was built against.
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(version = "3.12")

To install mixOmics, run this command:

# Install mixOmics from Bioconductor, then load it.
BiocManager::install("mixOmics")
library(mixOmics)
## 
## Loaded mixOmics 6.12.2
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us:  citation('mixOmics')

Sparse PCA

mixOmics has a suite of tools to help you prepare your data, including a CLR transform method, but you can also use the CLR transform object you already have.

In this implementation, you must choose the number of variables to keep for each of your principal components. This choice is really up to you since this is mostly an exploratory analysis. Let’s start with 10 OTUs on three axes.

mixOmics uses the same matrix orientation as prcomp; samples=rows, OTUs=columns

# Sparse PCA: ncomp = number of components, keepX = number of variables
# (OTUs) retained on each axis. Same orientation as prcomp: samples = rows.
d.spca <- spca(
  t(otu.clr.table),
  ncomp = 3,
  center = TRUE,
  scale = FALSE,
  keepX = c(10, 10, 10)
)

d.spca
## 
## Call:
##  spca(X = t(otu.clr.table), ncomp = 3, center = TRUE, scale = FALSE, keepX = c(10, 10, 10)) 
## 
##  sparse pCA with 3 principal components. 
##  You entered data X of dimensions: 344 7103 
##  Selection of 10 10 10 variables on each of the principal components on the X data set. 
##  Main numerical outputs: 
##  -------------------- 
##  loading vectors: see object$rotation 
##  principal components: see object$x 
##  cumulative explained variance: see object$varX 
##  variable names: see object$names 
## 
##  Other functions: 
##  -------------------- 
##  selectVar, tune

mixOmics has some native graphing capabilities, which are ok.

# Group points by collection site for mixOmics' built-in plot.
group.labels <- mdf[row.names(d.spca$x), ]$site

plotIndiv(d.spca, group = group.labels, ind.names = TRUE, legend = TRUE)

We can get a little fancier:

# Combine community and site type into one grouping label, add ellipses.
group.labels <- paste(mdf[row.names(d.spca$x), ]$community,
                      mdf[row.names(d.spca$x), ]$type)
plotIndiv(d.spca, comp = 1:2, group = group.labels, ind.names = TRUE,
          ellipse = TRUE, legend = TRUE)

You can also extract the relevant data and plot it with ggplot.

# Axis labels from mixOmics' explained-variance vector.
PC1.label <- paste("PC1:", percent(d.spca$explained_variance[1], accuracy = 0.1))
PC2.label <- paste("PC2:", percent(d.spca$explained_variance[2], accuracy = 0.1))
PC3.label <- paste("PC3:", percent(d.spca$explained_variance[3], accuracy = 0.1))

pca.points <- data.frame(d.spca$x)
# Attach metadata in one indexed assignment instead of five repeated lines
# (site.type maps from mdf$type).
meta.cols <- c(site = "site", stream = "stream", community = "community",
               site.type = "type", date = "date")
pca.points[names(meta.cols)] <- mdf[row.names(pca.points), meta.cols]

# Month = first field of the "M/D/Y" date string; vapply pins the type.
pca.points$month <- as.factor(vapply(pca.points$date,
  function(x) str_split(x, pattern = "/")[[1]][1], character(1)))

ggplot(pca.points, aes(x = PC1, y = PC2, col = site.type, shape = community)) +
  geom_point(size = 3, alpha = 0.7, stroke = 1) +
  scale_color_manual(values = m.g.colors) +
  xlab(PC1.label) +
  ylab(PC2.label) +
  scale_shape_manual(values = c(1, 2, 7, 8))
## Warning: Removed 1 rows containing missing values (geom_point).

I think it’s now more obvious that while the amount of variance accounted for is similar in the first two axes, the separation is more clear between samples.

ggplot(pca.points, aes(x=PC2, y=PC3, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC3.label) + xlab(PC2.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).

The third axis in this version also has a new quality where Phandolin and Neverwinter residential samples are cleanly separated. Neat.

We can also add stats ellipses to our plots. Read up on those before you use them and make sure you’re using a statistic that makes sense.

ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + stat_ellipse(type="norm")
## Too few points to calculate an ellipse
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).

Because we’ve already restricted our input matrix to 10 OTUs on each axis, analyzing OTU contribution is a little easier.

Let’s look at what OTUs our SPCA kept.

spca.rotation <- data.frame(d.spca$rotation)
# Keep only OTUs the sparse PCA actually selected (nonzero on any component).
selected <- spca.rotation$PC1 != 0 | spca.rotation$PC2 != 0 |
  spca.rotation$PC3 != 0
spca.rotation <- spca.rotation[which(selected), ]
spca.rotation
tax.table[row.names(spca.rotation), ]

Unfortunately, by nature, sparse PCAs make horrible biplots. But they are a good way to explore which OTUs contribute most to variation between groups.

# Give readable names to OTUs: label samples by site and OTUs by genus.
sample.labels <- mdf[row.names(d.spca$x), "site"]
tax.labels <- tax.table[row.names(spca.rotation), "Genus"]

# Biplot of sample scores against the retained sparse loadings.
# The zero-length-arrow warnings are expected: many sparse loadings are
# exactly 0 on the two plotted axes, so those arrows are skipped.
biplot(x = d.spca$x, y = as.matrix(spca.rotation), var.axes = TRUE,
       col = c("black", "#56B4E9"), xlabs = sample.labels,
       ylabs = tax.labels, cex = 0.5)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

mixOmics has some neat alternative ways of visualizing loadings.

# Loading bar plots for each of the three sparse components (mixOmics).
for (k in 1:3) {
  plotLoadings(d.spca, comp = k,
               title = paste("Loadings on Component", k),
               contrib = "max", method = "mean")
}

Logistic PCA

More on this later.

Redundancy Analysis

Redundancy analysis is great for connecting two large datasets together. The idea is to do some kind of dimension reduction twice, using the eigenvectors of the first PCA (or PCoA or NMDS or LDA) to create the second PCA. The function rda does all of this for you.

Preprocessing

If you’ve made technical replicates of your sequences, then more than likely your datasets are not the same size. You will need to either combine your technical replicates together, select one, or make averages. Read counts are dependent on total read depth, as we’ve discussed many many times, so if you want to combine or average, you should do that with the CLR transformed tables. The easiest route is to select one, otherwise I recommend taking the average proportional abundance value.

Here’s how to do that:

First, double check that you’re not missing any metadata information that would inadvertently lump non-replicates together.

# One row per unique metadata combination; n > 1 flags technical replicates.
mdf %>%
  group_by(date, community, type, stream, site, bottle) %>%
  tally()

Next we’ll make a small function that tells us which sample names to average together.

# Average CLR values across technical replicates.
# Store each sample's row name so group_map() can pull the matching columns.
mdf$index <- row.names(mdf)

# For every replicate group, take the row-wise mean of that group's columns
# in the CLR-transformed OTU table (a list with one vector per group).
OTU.averages <- mdf %>%
  group_by(date, community, type, stream, site, bottle) %>%
  group_map(~ rowMeans(data.frame(otu.clr.table[, .x$index])))

# Flattened copy of the group means. NOTE(review): nothing below appears to
# use this — confirm it is needed before removing.
OTU.mat <- unlist(OTU.averages)

# One metadata row per replicate group, plus a fresh condensed sample name.
# paste0() and seq_len(nrow(...)) replace paste(sep="") and seq(1, dim()[1]).
condensed.mdf <- mdf %>%
  group_by(date, community, type, stream, site, bottle) %>%
  tally()
condensed.mdf$SampleName <- paste0("CS", seq_len(nrow(condensed.mdf)))

# Assemble the condensed OTU table; group_map() preserves group order, so
# columns line up with the rows of condensed.mdf.
condensed.otu.table <- data.frame(OTU.averages)
colnames(condensed.otu.table) <- condensed.mdf$SampleName

# Back to a plain data.frame so rows can be keyed by the new sample names.
condensed.mdf <- data.frame(condensed.mdf)
row.names(condensed.mdf) <- condensed.mdf$SampleName

condensed.otu.table

We now have the mean CLR for each set of sample replicates. The original sample data is in the object condensed.mdf.

Read in some fake chemistry data.

# Fake chemistry measurements; row names are sample names — presumably they
# match condensed.mdf$SampleName (verify against the RDA call below).
chemdata <- read.csv("ACS.fakechemdata.csv", row.names=1, header=TRUE)
chemdata

In this case we’re just going to give RDA our transformed OTU table and it will run both PCAs. Here’s what a PCA based only on the averaged OTUs would look like:

# PCA on the replicate-averaged CLR table; samples become rows via t().
# The data are already CLR-transformed, so we center but do not rescale.
d.pcx <- prcomp(t(condensed.otu.table), scale=FALSE, center=TRUE)

# Proportion of total variance on each of the first three components,
# formatted for the axis labels. (sum() around a single squared sdev was
# redundant and has been dropped.)
d.mvar <- sum(d.pcx$sdev^2)
PC1.label <- paste("PC1: ", percent(d.pcx$sdev[1]^2 / d.mvar, accuracy=0.1))
PC2.label <- paste("PC2: ", percent(d.pcx$sdev[2]^2 / d.mvar, accuracy=0.1))
PC3.label <- paste("PC3: ", percent(d.pcx$sdev[3]^2 / d.mvar, accuracy=0.1))

# Sample scores plus the metadata columns we plot by. row.names() already
# returns a character vector, so the as.vector() wrappers were unnecessary.
pca.points <- data.frame(d.pcx$x)
pca.points$site      <- condensed.mdf[row.names(pca.points), "site"]
pca.points$stream    <- condensed.mdf[row.names(pca.points), "stream"]
pca.points$community <- condensed.mdf[row.names(pca.points), "community"]
pca.points$site.type <- condensed.mdf[row.names(pca.points), "type"]
pca.points$date      <- condensed.mdf[row.names(pca.points), "date"]

# Month is the text before the first "/" in the date; vectorized sub()
# replaces the per-element sapply()/str_split() loop.
pca.points$month <- as.factor(sub("/.*$", "", pca.points$date))

ggplot(pca.points, aes(x = PC1, y = PC2, col = site.type, shape = community)) +
  geom_point(size = 3, alpha = 0.7, stroke = 1) +
  scale_color_manual(values = m.g.colors) +
  scale_shape_manual(values = c(1, 2, 7, 8)) +
  xlab(PC1.label) +
  ylab(PC2.label)
## Warning: Removed 1 rows containing missing values (geom_point).

# Full-model RDA: constrain the community ordination on every chemistry
# variable. scale=TRUE (spelled out — T is reassignable) standardizes the
# response columns, so inertia is reported as correlations.
acs.rda <- rda(t(condensed.otu.table) ~ ., data=chemdata, scale=TRUE)
acs.rda
## Call: rda(formula = t(condensed.otu.table) ~ Cl + SO4 + NO3 + PO4 + Fe
## + Mg + Ca + TP + OP + TN + NH4 + Temp + DO + Cond + pH, data =
## chemdata, scale = T)
## 
##                 Inertia Proportion Rank
## Total         7103.0000     1.0000     
## Constrained   1067.6241     0.1503   15
## Unconstrained 6035.3759     0.8497  106
## Inertia is correlations 
## 
## Eigenvalues for constrained axes:
##   RDA1   RDA2   RDA3   RDA4   RDA5   RDA6   RDA7   RDA8   RDA9  RDA10  RDA11 
## 263.96 174.89  94.43  75.88  65.45  53.64  52.84  45.52  43.79  40.68  36.95 
##  RDA12  RDA13  RDA14  RDA15 
##  34.66  31.01  27.57  26.36 
## 
## Eigenvalues for unconstrained axes:
##   PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8 
## 825.2 515.0 350.7 273.0 205.8 187.8 152.4 145.0 
## (Showing 8 of 106 unconstrained eigenvalues)

You can also run rda using an explicit formula.

# RDA constrained on the nitrogen/phosphorus variables only.
# scale=TRUE spelled out rather than the reassignable shorthand T.
acs.rda <- rda(t(condensed.otu.table) ~ TN + OP + PO4 + NO3 + NH4,
               data=chemdata, scale=TRUE)
acs.rda
## Call: rda(formula = t(condensed.otu.table) ~ TN + OP + PO4 + NO3 + NH4,
## data = chemdata, scale = T)
## 
##                 Inertia Proportion Rank
## Total         7.103e+03  1.000e+00     
## Constrained   3.206e+02  4.514e-02    5
## Unconstrained 6.782e+03  9.549e-01  116
## Inertia is correlations 
## 
## Eigenvalues for constrained axes:
##  RDA1  RDA2  RDA3  RDA4  RDA5 
## 93.18 82.33 59.51 48.50 37.09 
## 
## Eigenvalues for unconstrained axes:
##   PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8 
## 998.3 599.2 383.4 299.8 224.0 202.7 156.6 151.7 
## (Showing 8 of 116 unconstrained eigenvalues)

The constrained axes can account for only 5% of variance.

The unconstrained axes account for a lot, but not really that much in the first 3 axes.

# Triplot 1: OTU labels (blue) with chemistry constraint vectors (red),
# raw scores (scaling=0).
plot(acs.rda, type='n', scaling=0)
orditorp(acs.rda, display='sp', cex=0.75, scaling=0, col='blue')
text(acs.rda, display='cn', col='red', scaling=0)

# Triplot 2: site labels (blue) with chemistry vectors (red), symmetric
# scaling (scaling=3).
plot(acs.rda, type='n', scaling=3)
orditorp(acs.rda, display='sites', cex=0.75, scaling=3, col='blue')
text(acs.rda, display='cn', col='red', scaling=3)

The first plot is arguably the most useful since it displays OTUs alongside the chemistry data.

You can approach deeper analysis of which OTUs contribute most to the principal components using the same methods we used for PCAs.

These are the constrained axis sample transformations.

# Constrained-axis sample scores (one row per sample, one column per RDA axis).
data.frame(acs.rda$CCA$u)

This is the constrained axis rotational matrix for OTUs.

# Constrained-axis OTU loadings (rotation matrix for species scores).
data.frame(acs.rda$CCA$v)

And here are the relative contributions of each chemistry variable to each constrained axis.

# Biplot scores: contribution of each chemistry variable to each
# constrained axis.
data.frame(acs.rda$CCA$biplot)

If you’re interested in the PCA that’s created after redundancy analysis — meaning the variation in your dataset that cannot be explained by variation in the chemistry data — you can extract it like this:

# Unconstrained (residual) ordination: the variation left over after the
# chemistry constraints are removed.
sample.projections <- data.frame(acs.rda$CA$u)
otu.rotations <- data.frame(acs.rda$CA$v)
eigenvectors <- acs.rda$CA$eig

# Percent of residual variance on the first two unconstrained axes.
PC1.label <- percent(eigenvectors["PC1"] / sum(eigenvectors), accuracy=0.1)
PC2.label <- percent(eigenvectors["PC2"] / sum(eigenvectors), accuracy=0.1)

# Attach plotting metadata to the residual sample scores. row.names()
# already returns a character vector, so no as.vector() is needed.
pca.points <- sample.projections
pca.points$site      <- condensed.mdf[row.names(pca.points), "site"]
pca.points$stream    <- condensed.mdf[row.names(pca.points), "stream"]
pca.points$community <- condensed.mdf[row.names(pca.points), "community"]
pca.points$site.type <- condensed.mdf[row.names(pca.points), "type"]
pca.points$date      <- condensed.mdf[row.names(pca.points), "date"]

# Make a month variable: text before the first "/" in the date.
# Vectorized sub() replaces the per-element sapply()/str_split() loop.
pca.points$month <- as.factor(sub("/.*$", "", pca.points$date))

ggplot(pca.points, aes(x = PC1, y = PC2, col = site.type, shape = community)) +
  geom_point(size = 3, alpha = 0.7, stroke = 1) +
  scale_color_manual(values = m.g.colors) +
  scale_shape_manual(values = c(1, 2, 7, 8)) +
  xlab(PC1.label) +
  ylab(PC2.label)
## Warning: Removed 1 rows containing missing values (geom_point).

Principal Coordinates Analysis (PCoA)

Nonmetric MultiDimensional Scaling (NMDS)

Distance Based Visualizations

Linear Discriminant Analysis